From the presivous analysis result, we try to find the pure relation between colorfullness/complexity and rating without considering user’s gender, age, education’ effects towards to model. Thus, we will regroup the dataset and then do the fitted linear regression model. Except, that, we will choose some web imgs as compare test after the model is finished. The imgs are eng(346, 347, 348, 349), webby(18, 19). grayscale(18, 19)…
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
train_dat <- read.table("/Volumes/STF/Final_siemens/Research_Qi/Test4/train_list_color_complx.txt", quote="\"")
test_dat <- read.table("/Volumes/STF/Final_siemens/Research_Qi/Test4/test_list_color_complx.txt", quote="\"")
web <- c(train_dat$V1)
rate <- c(train_dat$V2)
color <- c(train_dat$V3)
compl <- c(train_dat$V4)
dat_tarin <- data.frame(rate, color, compl)
# analysis rate itself
par(mfrow=c(1, 2))
qqnorm(rate)
qqline(rate, col = 2)
hist(rate)
hist((compl)^0.4)
hist(color)
hist(color^0.4)
#qqnorm(rate, color)
par(mfrow=c(1, 2))
qqplot(color, rate, xlab = "colorfullness", ylab = "mean rate value")
qqplot(compl, rate, xlab = "complextymodel", ylab = "mean rate value")
# Individual Pearson correlation (rate, colorfulness, complexity)
df <- data.frame(rate, color, compl)
plot(df[,1:3])
cor(df[, 1:3], method = "pearson")
## rate color compl
## rate 1.0000000 0.1981562 -0.1795888
## color 0.1981562 1.0000000 0.4352672
## compl -0.1795888 0.4352672 1.0000000
Normal Linear Regression (colorfulness + complexity)
lm1 <- lm(rate ~ color + compl)
summary(lm1)
##
## Call:
## lm(formula = rate ~ color + compl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.83377 -0.50135 0.04571 0.60210 2.36165
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.46151 0.24011 18.581 < 2e-16 ***
## color 0.23815 0.03962 6.011 4.83e-09 ***
## compl -0.28257 0.04887 -5.782 1.69e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9639 on 335 degrees of freedom
## Multiple R-squared: 0.1265, Adjusted R-squared: 0.1212
## F-statistic: 24.25 on 2 and 335 DF, p-value: 1.463e-10
anova(lm1)
## Analysis of Variance Table
##
## Response: rate
## Df Sum Sq Mean Sq F value Pr(>F)
## color 1 13.991 13.9910 15.058 0.0001255 ***
## compl 1 31.067 31.0669 33.437 1.688e-08 ***
## Residuals 335 311.257 0.9291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")
cor.test(resid.nlm+rate, rate)
##
## Pearson's product-moment correlation
##
## data: resid.nlm + rate and rate
## t = 103.3291, df = 336, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9809903 0.9875723
## sample estimates:
## cor
## 0.984627
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5
mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]
par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)
mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)
predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
##
## Pearson's product-moment correlation
##
## data: mytest$rate and predi_value
## t = 4.3264, df = 96, p-value = 3.713e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2234223 0.5576606
## sample estimates:
## cor
## 0.403936
Linear + Interaction
lm1 <- lm(rate ~ color + compl + color * compl)
summary(lm1)
##
## Call:
## lm(formula = rate ~ color + compl + color * compl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.43933 -0.40221 0.07234 0.55138 1.72819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.98304 0.89330 1.100 0.2739
## color 1.11924 0.19267 5.809 8.51e-08 ***
## compl 0.46013 0.18991 2.423 0.0173 *
## color:compl -0.17826 0.03672 -4.855 4.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7964 on 94 degrees of freedom
## Multiple R-squared: 0.3407, Adjusted R-squared: 0.3197
## F-statistic: 16.19 on 3 and 94 DF, p-value: 1.454e-08
anova(lm1)
## Analysis of Variance Table
##
## Response: rate
## Df Sum Sq Mean Sq F value Pr(>F)
## color 1 0.953 0.9533 1.503 0.2233
## compl 1 14.909 14.9091 23.506 4.913e-06 ***
## color:compl 1 14.952 14.9518 23.573 4.778e-06 ***
## Residuals 94 59.622 0.6343
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")
cor.test(resid.nlm+rate, rate)
##
## Pearson's product-moment correlation
##
## data: resid.nlm + rate and rate
## t = 34.3018, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9430459 0.9741126
## sample estimates:
## cor
## 0.9615429
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5
mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]
par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)
mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)
predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
##
## Pearson's product-moment correlation
##
## data: mytest$rate and predi_value
## t = 7.0438, df = 96, p-value = 2.821e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4357702 0.7009541
## sample estimates:
## cor
## 0.5837192
Linear + Interaction color reduce
lm1 <- lm(rate ~ compl + color * compl)
summary(lm1)
##
## Call:
## lm(formula = rate ~ compl + color * compl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.43933 -0.40221 0.07234 0.55138 1.72819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.98304 0.89330 1.100 0.2739
## compl 0.46013 0.18991 2.423 0.0173 *
## color 1.11924 0.19267 5.809 8.51e-08 ***
## compl:color -0.17826 0.03672 -4.855 4.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7964 on 94 degrees of freedom
## Multiple R-squared: 0.3407, Adjusted R-squared: 0.3197
## F-statistic: 16.19 on 3 and 94 DF, p-value: 1.454e-08
anova(lm1)
## Analysis of Variance Table
##
## Response: rate
## Df Sum Sq Mean Sq F value Pr(>F)
## compl 1 7.265 7.2648 11.454 0.0010422 **
## color 1 8.598 8.5977 13.555 0.0003866 ***
## compl:color 1 14.952 14.9518 23.573 4.778e-06 ***
## Residuals 94 59.622 0.6343
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")
cor.test(resid.nlm+rate, rate)
##
## Pearson's product-moment correlation
##
## data: resid.nlm + rate and rate
## t = 34.3018, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9430459 0.9741126
## sample estimates:
## cor
## 0.9615429
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5
mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]
par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)
mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)
predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
##
## Pearson's product-moment correlation
##
## data: mytest$rate and predi_value
## t = 7.0438, df = 96, p-value = 2.821e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4357702 0.7009541
## sample estimates:
## cor
## 0.5837192
Original 5 factor + interaction 0.62
lm1 <- lm(rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) + I(abs(color^3 - compl^3)) + color * compl)
summary(lm1)
##
## Call:
## lm(formula = rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) +
## I(abs(color^3 - compl^3)) + color * compl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.49350 -0.38984 0.09987 0.48085 1.75916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.075801 5.873379 0.524 0.6018
## color 1.494803 0.355571 4.204 6.12e-05 ***
## compl 1.451452 1.620673 0.896 0.3728
## I(color^(exp(1))) 0.003567 0.010599 0.337 0.7372
## I(compl^(0.4)) -3.590708 6.799542 -0.528 0.5987
## I(abs(color^3 - compl^3)) -0.003312 0.002630 -1.259 0.2111
## color:compl -0.263490 0.126954 -2.075 0.0408 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.782 on 91 degrees of freedom
## Multiple R-squared: 0.3846, Adjusted R-squared: 0.3441
## F-statistic: 9.48 on 6 and 91 DF, p-value: 4.451e-08
anova(lm1)
## Analysis of Variance Table
##
## Response: rate
## Df Sum Sq Mean Sq F value Pr(>F)
## color 1 0.953 0.9533 1.5589 0.21503
## compl 1 14.909 14.9091 24.3793 3.568e-06 ***
## I(color^(exp(1))) 1 12.425 12.4252 20.3175 1.947e-05 ***
## I(compl^(0.4)) 1 1.986 1.9856 3.2468 0.07488 .
## I(abs(color^3 - compl^3)) 1 1.878 1.8780 3.0709 0.08307 .
## color:compl 1 2.634 2.6343 4.3076 0.04076 *
## Residuals 91 55.651 0.6116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(lm1)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")
cor.test(resid.nlm+rate, rate)
##
## Pearson's product-moment correlation
##
## data: resid.nlm + rate and rate
## t = 32.5322, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9371434 0.9713823
## sample estimates:
## cor
## 0.9575154
## Test
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5
mytest <- data.frame(rate, color, compl)
estimate <- predict(lm1 , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]
par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)
mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(lm1 , newdata = mytest)
predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
##
## Pearson's product-moment correlation
##
## data: mytest$rate and predi_value
## t = 7.7464, df = 96, p-value = 9.739e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4809596 0.7289148
## sample estimates:
## cor
## 0.6201938
According to QQplot result, the colorfulness and complexity should fit the mean rate value by normal distribution approximately. Thus, We will try two different regression model to see the result: normal linear regression model and fitted linear regression model
n.lm <- lm(rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) + I(abs(color^3 - compl^3)))
summary(n.lm)
##
## Call:
## lm(formula = rate ~ color + compl + I(color^(exp(1))) + I(compl^(0.4)) +
## I(abs(color^3 - compl^3)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36206 -0.42498 0.08857 0.53424 1.72676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.303775 4.341664 -1.222 0.224980
## color 0.975095 0.256943 3.795 0.000264 ***
## compl -1.632653 0.658419 -2.480 0.014971 *
## I(color^(exp(1))) -0.016101 0.004831 -3.333 0.001240 **
## I(compl^(0.4)) 7.532503 4.259201 1.769 0.080288 .
## I(abs(color^3 - compl^3)) 0.001742 0.001012 1.722 0.088481 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.796 on 92 degrees of freedom
## Multiple R-squared: 0.3555, Adjusted R-squared: 0.3205
## F-statistic: 10.15 on 5 and 92 DF, p-value: 9.183e-08
anova(n.lm)
## Analysis of Variance Table
##
## Response: rate
## Df Sum Sq Mean Sq F value Pr(>F)
## color 1 0.953 0.9533 1.5048 0.22306
## compl 1 14.909 14.9091 23.5332 4.984e-06 ***
## I(color^(exp(1))) 1 12.425 12.4252 19.6124 2.609e-05 ***
## I(compl^(0.4)) 1 1.986 1.9856 3.1341 0.07998 .
## I(abs(color^3 - compl^3)) 1 1.878 1.8780 2.9643 0.08848 .
## Residuals 92 58.285 0.6335
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,2))
plot(n.lm)
# Pearsonn's correlation (compare itself)
par(mfrow=c(1, 1))
resid.nlm <- resid(n.lm)
plot(resid.nlm + rate, rate, ylab = "real rate", xlab = "nlm.model estimate")
cor.test(resid.nlm+rate, rate)
##
## Pearson's product-moment correlation
##
## data: resid.nlm + rate and rate
## t = 33.6614, df = 96, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9410073 0.9731707
## sample estimates:
## cor
## 0.9601528
After finish the model, we then use test_list_color_complx.txt to test the model
rate <- test_dat$V2
color <- test_dat$V3
compl <- test_dat$V4
cnn <- test_dat$V5
# color2 = color
#
# for (i in 1:length(color)) {
# if(color[i] < 3.5) {
# color2[i] = color[i] * 0.8
# } else if (color[i] > 7) {
# color2[i] = color[i] * 1.8
# } else {
# color2[i] = color[i]
# }
# }
#
mytest <- data.frame(rate, color, compl)
estimate <- predict(n.lm , newdata = mytest, se.fit = TRUE, interval = "prediction")
predi_value <- estimate$fit[,1]
par(mfrow=c(1, 2))
qqplot(color, rate)
qqplot(compl, rate)
mytest <- data.frame(rate, color, compl, cnn)
estimate <- predict(n.lm , newdata = mytest)
predi_value <- estimate
par(mfrow=c(1, 1))
cor.test(mytest$rate, predi_value)
##
## Pearson's product-moment correlation
##
## data: mytest$rate and predi_value
## t = 7.277, df = 96, p-value = 9.308e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4512086 0.7105991
## sample estimates:
## cor
## 0.5962478
final_data <- data.frame(test_dat, predi_value)
After the linear model is done, we will compare the test result with CNN test result and real rate using Person table.
compare_table <- data.frame(mytest$rate, test_dat$V5, predi_value)
hist(mytest$rate)
plot(compare_table[,1:3])
cor(compare_table[, 1:3], method = "pearson")
## mytest.rate test_dat.V5 predi_value
## mytest.rate 1.0000000 0.8492508 0.5962478
## test_dat.V5 0.8492508 1.0000000 0.5353826
## predi_value 0.5962478 0.5353826 1.0000000
# CNN predict
plot(test_dat$V5, mytest$rate, xlim=c(2, 7), ylim=c(2, 7), ylab = "Real Rate", xlab = "CNN predict rate")
p1 <- ggplot(test_dat, aes(x = test_dat$V2, y = test_dat$V5)) + geom_point(colour = "#48b9f3", size = 3, alpha = 1)
p1 + geom_smooth(method = lm, se = FALSE) + labs(x = "User Aesthetics Rating", y = "CNN Predict Rating") + xlim(2.2, 6.5) + ylim(2.2, 6.5) + theme_bw()
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
# linear regression
plot(predi_value, mytest$rate, xlim=c(2, 7), ylim=c(2, 7), ylab = "Real Rate", xlab = "regression predict rate")
new_dat <- data.frame(test_dat, predi_value)
p2 <- ggplot(new_dat, aes(x = new_dat$V2, y = predi_value)) + geom_point(colour = "#48b9f3", size = 3, alpha = 1)
p2 + geom_smooth(method = lm, se = FALSE) + labs(x = "User Aesthetics Rating", y = "Linear Predict Rating") + xlim(2.2, 6.5) + ylim(2.2, 6.5) + theme_bw()
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# linear regression vs cnn
plot(predi_value, test_dat$V5 , xlim=c(2, 7), ylim=c(2, 7), ylab = "CNN predict rate", xlab = "regression predict
rate")
true_rate = new_dat$V2
cnn_rate = new_dat$V5
linear_rate = predi_value
dat <- data.frame(xx =c(true_rate, cnn_rate, linear_rate), yy = rep(letters[1:3], each = length(cnn_rate)))
## Histogram user aesthetics rating
p4 <- ggplot(dat, aes(x = xx)) +
geom_histogram(data = subset(dat, yy == 'a'), aes(y = ..density..), binwidth=.6, colour="black", fill = "white", alpha = 0.5) +
geom_density(data = subset(dat, yy == 'a'), fill = "#f7746c", colour = "#f7746c", alpha = 0.2) + xlim(1, 8)
p4 + theme_bw()
## Histogram cnn predict rating
p6 <- ggplot(dat, aes(x = xx)) +
geom_histogram(data = subset(dat, yy == 'b'), aes(y = ..density..), binwidth =.8, colour="black", fill = "white", alpha = 0.5) +
geom_density(data = subset(dat, yy == 'b'), colour = "#05bdc2", fill = "#05bdc2", alpha = 0.2) + xlim(1, 8)
p6 + theme_bw()
p7 <- ggplot(dat, aes(x = xx)) +
geom_histogram(data = subset(dat, yy == 'a'), aes(y = ..density..), fill = "#f7746c", alpha = 0.5) +
geom_histogram(data = subset(dat, yy == 'b'), aes(y = ..density..), fill = "#05bdc2", alpha = 1)
p7 + theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# p4 + stat_function(fun = dnorm, args = list(mean = mean(cnn_rate), sd = sd(cnn_rate)), lwd = 2, color = "red")
p5 <- ggplot(dat, aes(x = xx)) +geom_density(data = subset(dat, yy == 'a'), fill = NA, colour = "#f7746c", alpha = 1) + geom_density(data = subset(dat, yy == 'b'), fill = NA, colour = "#05bdc2", alpha = 1) + xlim(1,8) + labs(x = "rate")
p5 + theme_bw()
n = c(true_rate,cnn_rate)
ggplot(new_dat, aes(new_dat$V2), colour = "red") + geom_density(fill = "red", alpha = 0.2) + xlim(1, 8)
ggplot(new_dat, aes(new_dat$V4)) + geom_density(alpha = 0.1) + xlim(1, 8)
## Warning: Removed 1 rows containing non-finite values (stat_density).